SharpestMinds project by Angela Teng, mentored by Jeremie Harris.
Completed: (list finished milestones here)
To Do: (list outstanding tasks here)
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
#import sklearn.cross_validation as cross_validation
#from sklearn.cross_validation import train_test_split
import sklearn.model_selection as model_selection
from sklearn import linear_model
from IPython.display import Image
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydot
import mca
from sklearn.neighbors import KNeighborsClassifier
from random import sample
import os
# Working-directory setup and raw data load.
os.getcwd()
# NOTE(review): hard-coded absolute path — fails on any other machine; prefer a
# relative path or an environment variable.
os.chdir('/Users/angelateng/Documents/GitHub/Projects/Covertype_Prediction/Data')
os.getcwd()
# BUG FIX: removed the stray `data = open("covtype.data")` — that handle was
# never read or closed (resource leak) and was immediately overwritten below.
# pandas parses the comma-separated ".data" file directly.
data = pd.read_csv("covtype.data", header=None)
data.head()
# Column names for the UCI covertype data: 10 continuous terrain features,
# 4 wilderness-area dummies, 40 soil-type dummies, then the target label.
cols = (
    ['elevation', 'aspect', 'slope', 'horizontal_distance_to_hydrology',
     'vertical_distance_to_hydrology', 'Horizontal_Distance_To_Roadways',
     'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
     'Horizontal_Distance_To_Fire_Points']
    + ['Wilderness_Area_%d' % area_idx for area_idx in range(1, 5)]
    + ['Soil_Type_%d' % soil_idx for soil_idx in range(1, 41)]
    + ['Cover_Type']
)
#cols
# Apply the 55 column names to the header-less frame loaded above.
data.columns = cols
data.head()
# check data types
data.dtypes
data.info()
# no null values
Goal: predict cover type from the other variables (7 distinct cover types).
# Distribution of the target: 7 discrete classes, so a count plot is the
# right view (a histogram would bin a categorical variable arbitrarily).
data['Cover_Type'].describe()
sns.set(style="whitegrid")
sns.countplot(data['Cover_Type'])

# Univariate distributions of the continuous terrain features.
# Units: elevation and the distance columns are in meters, aspect is degrees
# azimuth, slope is degrees, hillshade is a 0-255 index.
# NOTE(review): vertical_distance_to_hydrology has negative values —
# presumably "below the water feature"; confirm against the data docs.
for continuous_col in ['elevation', 'aspect', 'slope',
                       'horizontal_distance_to_hydrology',
                       'vertical_distance_to_hydrology',
                       'Horizontal_Distance_To_Roadways',
                       'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                       'Horizontal_Distance_To_Fire_Points']:
    sns.distplot(data[continuous_col])

# The four wilderness-area indicators are binary dummies, so count plots.
for dummy_col in ['Wilderness_Area_1', 'Wilderness_Area_2',
                  'Wilderness_Area_3', 'Wilderness_Area_4']:
    sns.countplot(data[dummy_col])

# Summary statistics, overall and broken out by the target class.
data.groupby('Cover_Type').describe()
data.describe()
# sns.pairplot(data) is deliberately omitted: the full frame is too large to render.
# Correlation matrix over all 55 columns.
corrmat = data.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
# elevation seems important
# Annotated heatmap of the k variables most correlated with Cover_Type.
k = 10  # number of variables for heatmap
# BUG FIX: this previously rebound `cols` — the full 55-name column list that
# a later cell slices with cols[:54] — to a 10-element index. Use a distinct
# local name so the column list survives.
top_cols = corrmat.nlargest(k, 'Cover_Type')['Cover_Type'].index
cm = np.corrcoef(data[top_cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=top_cols.values,
                 xticklabels=top_cols.values)
plt.show()
#scatterplot between cover type, slope and elevation
sns.set()
cols2 = ['Cover_Type', 'slope', 'elevation']
# NOTE(review): `size` was renamed `height` in seaborn >= 0.9 — confirm the
# pinned version still accepts it.
sns.pairplot(data[cols2], size = 2.5)
plt.show();
#not 100% sure how to read this to be honest
#based on this plot and the corr matrix, the vars that seem most correlated with covertype are aspect, slope, and elevation
plt.style.use('ggplot')
# Histogram of the integer class labels (25 bins over 7 labels leaves gaps).
data['Cover_Type'].plot.hist(alpha=0.5,bins=25)
#data[data['Cover_Type']==1].plot.box()
# Hierarchically-clustered heatmap of the full correlation matrix.
sns.clustermap(data.corr(),annot=True)
#sns.clustermap(data, metric="correlation")
# Scatter with fitted regression line: elevation vs. the class label.
sns.lmplot(x='Cover_Type',y='elevation',data=data)
#sns.pairplot(data,hue='Cover_Type',palette='rainbow')
#data set too big, doesn't run
# Overall mean elevation, then per-cover-type means of the main terrain
# features. (Bare expressions only display in a notebook; the computed
# values are identical either way.)
avg_elevation = data["elevation"].mean()
for terrain_feature in ["elevation", "slope", "aspect",
                        "horizontal_distance_to_hydrology",
                        "vertical_distance_to_hydrology"]:
    data[terrain_feature].groupby(data['Cover_Type']).mean()
# NOTE: earlier boxplot and cufflinks iplot attempts were removed — none of
# them rendered on the full frame; revisit with a sampled subset.
# One-hot encode the 7-level target; resulting columns are the ints 1..7.
cov_dummy = pd.get_dummies(data['Cover_Type'])
cov_dummy.head()
# df4: dummy target columns prepended to the original frame (used later for
# per-class sampling).
df4 = pd.concat([cov_dummy, data], axis = 1)
df4.head()
sns.violinplot(data = data, x='Cover_Type', y='elevation')
#sns.swarmplot(data = ds_cat, x='Cover_Type', y='elevation', color = 'k', alpha = 0.6)
#fig = plt.figure()
#ax1 = fig.add_subplot(2,1,1)
#sns.countplot(data = data, x = 'Cover_Type', ax = ax1)
#ax2 = fig.add_subplot(2,1,2)
#sns.boxplot(data = ds_cat, x='Cover_Type', y='elevation' , ax = ax2)
#data.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8)
# Correlation of every feature with Cover_Type; [:-1] drops the trailing
# self-correlation (Cover_Type is the last column).
cov_type_corr = data.corr()['Cover_Type'][:-1]
cov_type_corr
# Features whose absolute correlation with the target exceeds 0.5.
golden_features_list = cov_type_corr[abs(cov_type_corr) > 0.5].sort_values(ascending=False)
print("There is {} strongly correlated values with Cover Type:\n{}".format(len(golden_features_list), golden_features_list))
# Feature-feature correlations (target excluded), masked to the strong ones.
# NOTE(review): thresholds are asymmetric (>= 0.5 vs <= -0.4) — confirm intentional.
corr = data.drop('Cover_Type', axis=1).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr[(corr >= 0.5) | (corr <= -0.4)],
cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
annot=True, annot_kws={"size": 8}, square=True);
# --- Feature selection via Recursive Feature Elimination (RFE) ---
# Base classifier used to evaluate subsets of attributes.
model_1 = LogisticRegression()
# Design matrix: every column except the target; single-column target frame.
X = data.loc[:, data.columns != 'Cover_Type']
X.head()
y = data.loc[:, data.columns == 'Cover_Type']
y.head()
# Build an RFE selector keeping the 3 strongest attributes.
# FIX: the old comment said "select 10 attributes" while the code passed 3
# positionally; the explicit keyword makes the intent unambiguous.
rfe = RFE(model_1, n_features_to_select=3)
# NOTE(review): the fit never completed on the full frame — dataset may be
# too large for this estimator; consider fitting on a subsample.
#rfe = rfe.fit(X, y)
#print(rfe.support_)
#print(rfe.ranking_)
"The Recursive Feature Elimination (RFE) method is a feature selection approach. It works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes (and combination of attributes) contribute the most to predicting the target attribute."
# Fit an Extra-Trees model to rank feature importance.
model_2 = ExtraTreesClassifier()
# BUG FIX: was `model_2.fit(x, y)`, but no lowercase `x` exists — the design
# matrix is `X`. ravel() flattens the single-column target frame into the
# 1-D array sklearn expects.
model_2.fit(X, y.values.ravel())
# display the relative importance of each attribute
print(model_2.feature_importances_)
# The 54 predictor names (everything except the trailing Cover_Type).
x_cols = cols[:54]
feat_impt_val = pd.DataFrame([model_2.feature_importances_], index=None)
feat_impt_val.columns = x_cols
feat_impt_val
It seems like elevation, aspect, and slope are the three most important features.
# --- Train/test split (70/30, seeded for reproducibility) ---
# BUG FIX: this section previously aliased undefined lowercase names
# (`X_train = x`, `Y_train = y`) and passed `x, y` to train_test_split;
# the actual frames are `X` and `y`.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, train_size=0.70, random_state=101)
# sanity check that the number of samples matches
X_train.shape
y_train.shape
print ("X_train: ", X_train.head())
print ("y_train: ", y_train.head())
print("X_test: ", X_test.head())
print ("y_test: ", y_test.head())
#linear regression - trying to see which vars are most important, even tho y is categorical.... also should be doing logisitic but how do we do this on categorical data
# NOTE(review): fitting OLS to a nominal 1-7 label treats the classes as
# ordered quantities; treat the coefficients/R^2 below as exploratory only.
lm = linear_model.LinearRegression()
# NOTE(review): rebinds `model_1`, clobbering the LogisticRegression created
# for the RFE section above.
model_1 = lm.fit(X_train,y_train)
model_1
# In-sample predictions (on the training split, not the held-out test set).
predictions = lm.predict(X_train)
print(predictions)
#R squared of the model
lm.score(X_train,y_train)
lm.coef_
lm.intercept_
# WAS DOING THE ABOVE WRONG, THIS IS HOW YOU DO IT:
#X = data.drop(data['Cover_Type'], axis=1)
#y = data['Cover_Type']
# Fresh unseeded 70/30 split (overwrites the earlier seeded split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
Decision tree classifier (chosen since the target is categorical).
# Decision tree on all 54 predictors.
dtree = DecisionTreeClassifier()
dtree
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
# Per-class precision/recall/F1, then the 7x7 confusion matrix.
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
#not sure how to read this
# First 54 column names = predictor names for the tree export.
features = list(data.columns[:54])
features
# Export the fitted tree to DOT; the pydot/Image rendering is commented out.
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)
#graph = pydot.graph_from_dot_data(dot_data.getvalue())
#Image(graph[0].create_png())
"Multiple Correspondence Analysis (MCA), which is an extension of principal component analysis when the variables to be analyzed are categorical instead of quantitative (which is the case here with your binary variables). See for instance Husson et al. (2010), or Abdi and Valentin (2007)." — source: https://stats.stackexchange.com/questions/159705/would-pca-work-for-boolean-binary-data-types
# NOTE(review): leftover from an MCA tutorial snippet — the 'Expert 2' /
# 'Expert 3' labels don't match this dataset, and `src_index` is never used
# afterwards; consider deleting.
src_index = (['Cover_Type1'] * 7 + ['Expert 2'] * 9 + ['Expert 3'] * 6)
#drop cols except 3 most "important".... lol bad practice but... just trying stuff
## KNN
# Standardize all 54 predictors — KNN is distance-based, so feature scale
# dominates the metric unless the columns are standardized.
scaler = StandardScaler()
scaler.fit(data.drop('Cover_Type',axis=1))
scaled_features = scaler.transform(data.drop('Cover_Type',axis=1))
df_feat = pd.DataFrame(scaled_features,columns=data.columns[:-1])
df_feat.head()
#train test split
X_train, X_test, y_train, y_test = train_test_split(scaled_features,data['Cover_Type'],
test_size=0.30)
#KNN: trying to come up with a model to predict cover_type 1-7 or not. We'll start with k=1.
knn = KNeighborsClassifier(n_neighbors=1)
# NOTE(review): the fit on the full scaled frame never completed here.
#knn.fit(X_train,y_train)
#jk nothing works
### Just pick the first 3 variables — drop everything else (known shortcut, not best practice)
# Reduced frame: target plus the three features flagged as most important.
df2 = data[["Cover_Type","elevation","aspect","slope"]]
df2.head()
# Re-fit the existing scaler on just these three columns.
scaler.fit(df2.drop('Cover_Type',axis=1))
scaled_features = scaler.transform(df2.drop('Cover_Type',axis=1))
df_feat2 = pd.DataFrame(scaled_features,columns=df2.columns[1:])
df_feat2.head()
X_train, X_test, y_train, y_test = train_test_split(scaled_features,df2['Cover_Type'],
test_size=0.30)
# `knn` is still the k=1 model constructed in the previous section.
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
# Choose k: sweep k = 1..39 and record the test-set error rate for each.
error_rate = []
# Will take some time
# BUG FIX: the loop body below had lost its indentation (a syntax error in a
# plain .py file — likely a notebook-export artifact); restored.
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))
# Elbow plot of error rate vs. k.
plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate,color='blue', linestyle='dashed', marker='o',
markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
# Re-fit with the k chosen from the error-rate sweep above (k=20).
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train,y_train)
pred = knn.predict(X_test)
print('WITH K=20')
print('\n')
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))
# still-poor scores: three features aren't enough signal for this target
# Decision tree using only the 3-feature frame df2.
# NOTE(review): rebinds X/y, replacing the earlier full 54-feature versions.
X = df2.drop('Cover_Type',axis=1)
y = df2['Cover_Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
dtree = DecisionTreeClassifier()
dtree.fit(X_train,y_train)
predictions = dtree.predict(X_test)
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
# Predictor names (df2 columns minus the leading Cover_Type).
features = list(df2.columns[1:])
features
# Export the fitted tree to DOT and render it inline as a PNG.
dot_data = StringIO()
export_graphviz(dtree, out_file=dot_data,feature_names=features,filled=True,rounded=True)
# graph_from_dot_data returns a list of graphs; take the first.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
Image(graph[0].create_png())
# do you sample before you normalize? and log transform? or after
#dataframe with dummy variables for covertype
df4.head()
# Alias df4; drop() below returns a new frame, so df4 itself is untouched.
df_dummy = df4
#df_dummy.drop(['Cover_Type'], axis=1)
# Keep only the dummy target columns (1..7) plus the features.
df_dummy = df_dummy.drop(['Cover_Type'], axis=1)
df_dummy.head()
df_dummy.columns
# NOTE(review): the `?name` lines below are IPython help invocations — they
# only work in a notebook/IPython session, not in a plain .py module.
?train_test_split
?sample
?loc
?concatenate
?pd.concat
#sample(df_dummy['1'], 1000)
#sample(df_dummy, 1000)
#df_dummy.sample(1000)
# Select a random sample of 1000 rows per cover type; the dummy columns
# created by get_dummies are the integers 1..7.
# De-dupe note: DataFrame.sample draws without replacement by default, and
# each row has exactly one dummy set to 1, so the 7 chunks cannot overlap.
CV1 = df_dummy[df_dummy[1]==1].sample(1000)
CV2 = df_dummy[df_dummy[2]==1].sample(1000)
CV3 = df_dummy[df_dummy[3]==1].sample(1000)
CV4 = df_dummy[df_dummy[4]==1].sample(1000)
CV5 = df_dummy[df_dummy[5]==1].sample(1000)
CV6 = df_dummy[df_dummy[6]==1].sample(1000)
CV7 = df_dummy[df_dummy[7]==1].sample(1000)
CV1.head()
CV2.head()
# Concatenate the per-class samples into one balanced 7000-row frame.
# BUG FIX: the result was previously bound to `sample`, shadowing
# `random.sample` imported at the top of the file.
sample_df = pd.concat((CV1, CV2, CV3, CV4, CV5, CV6, CV7), axis=0)
sample_df.head()
sample_df.shape
# Exploratory stats and pair plot on the balanced subsample.
sample_df.describe()
sns.pairplot(sample_df)
Project notes outline (sections to be filled in):
- Data sources and documentation:
- Extraction:
- Exploration:
- General syntax:
- Data sampling:
- Feature selection:
- Transformation:
- Modeling:
- Loading: